1. Load Library and Data

library(tidyverse)
library(DESeq2)
library(dplyr)
library(ggrepel)
library(factoextra)

Load the meta data & gene counts:

meta <- read.csv('colData.csv')
meta <- meta[,(2:3)]
df <- read.csv('countData.csv')
rownames(df) <- df$X
df <- df[,(2:20)]

Import gene name mapping:

gene_mapping <- read.csv('gene_mapping.csv')

Check if all samples in df have its corresponding information in meta:

all(colnames(df) %in% meta$Run)
## [1] TRUE
colSums(df)
## SRR1027182 SRR1027186 SRR1027185 SRR1027177 SRR1027181 SRR1027175 SRR1027180 
##    5218431    6326462    6290938   11938349    7849877    5254143    6859199 
## SRR1027184 SRR1027190 SRR1027188 SRR1027176 SRR1027189 SRR1027174 SRR1027179 
##    8743762   15664469   12524362    5809471   13555119    5361726    7561150 
## SRR1027178 SRR1027187 SRR1027171 SRR1027173 SRR1027183 
##    7219770    2884803    7691098    6226861    6213487
median(colSums(df))
## [1] 6859199
dim(df)
## [1] 58735    19
# plot variance against mean
mean_counts <- apply(df[, colnames(df) %in% c('SRR1027186', 'SRR1027185', 'SRR1027184', 'SRR1027187', 'SRR1027183')], 1, mean)
variance_counts <- apply(df[, colnames(df) %in% c('SRR1027186', 'SRR1027185', 'SRR1027184', 'SRR1027187', 'SRR1027183')], 1, var)
mean_var <- data.frame(mean_counts, variance_counts)

ggplot(mean_var) +
        geom_jitter(aes(x=mean_counts, y=variance_counts)) + 
        geom_line(aes(x=mean_counts, y=mean_counts, color="red")) +
        scale_y_log10() +
        scale_x_log10()
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis

2. Differential expression analysis using DESeq2

dds <- DESeqDataSetFromMatrix(countData = df, colData = meta, design = ~ Condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

Remove genes with 0 counts across all samples:

keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dim(dds)
## [1] 42617    19

Conduct DESeq:

dds_p <- DESeq(dds, test = c('Wald'))
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

2.1 Common DEGs in ALL cancer types compared against healthy samples

# result of TNBC vs normal
res_TN <- results(dds_p, contrast = c('Condition', 'TNBC', 'H'))
res_TN
## log2 fold change (MLE): Condition TNBC vs H 
## Wald test p-value: Condition TNBC vs H 
## DataFrame with 42617 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat      pvalue
##                 <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSG00000000003  311.0260      -1.141975  0.471351 -2.422770 1.54027e-02
## ENSG00000000005   13.1659       0.686549  1.028942  0.667238          NA
## ENSG00000000419  365.0482       0.675489  0.442766  1.525610 1.27107e-01
## ENSG00000000457  230.5482       1.458280  0.309078  4.718161 2.37986e-06
## ENSG00000000460  139.1937       1.803008  0.358960  5.022871 5.09046e-07
## ...                   ...            ...       ...       ...         ...
## ENSG00000285987  3.788509        4.24419   1.36330   3.11318 1.85085e-03
## ENSG00000285991  4.301967        1.78685   1.00742   1.77368 7.61153e-02
## ENSG00000285992  0.741578        3.97755   2.13154   1.86604 6.20354e-02
## ENSG00000285993  2.754683        5.69246   1.84344   3.08795 2.01542e-03
## ENSG00000285994 14.007356        4.27256   1.04174   4.10139 4.10681e-05
##                        padj
##                   <numeric>
## ENSG00000000003 4.03880e-02
## ENSG00000000005          NA
## ENSG00000000419 1.98492e-01
## ENSG00000000457 3.88334e-05
## ENSG00000000460 1.04108e-05
## ...                     ...
## ENSG00000285987 0.008005325
## ENSG00000285991 0.132732500
## ENSG00000285992 0.113661201
## ENSG00000285993 0.008583923
## ENSG00000285994 0.000408147

Sort the result table by log2FoldChange and filter out any genes with padj > 0.1 (FDR):

resTN_ord_log2 <- res_TN[order(abs(res_TN$log2FoldChange), decreasing = TRUE),] # sort the genes according to descending log2FoldChange
resTN_ord_log2 <- resTN_ord_log2[!is.na(resTN_ord_log2$padj), ] # exclude gene with padj as NA
resTN_ord_log2 <- resTN_ord_log2[resTN_ord_log2$padj < 0.1, ]
resTN_ord_log2
## log2 fold change (MLE): Condition TNBC vs H 
## Wald test p-value: Condition TNBC vs H 
## DataFrame with 20043 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat      pvalue
##                 <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSG00000256618 2187.5780        24.4064   2.53567   9.62522 6.25739e-22
## ENSG00000238554   28.3302        21.3536   2.56484   8.32551 8.39633e-17
## ENSG00000278996   44.0715        21.0402   3.26819   6.43786 1.21170e-10
## ENSG00000107807   19.8675        20.7398   2.40508   8.62337 6.50146e-18
## ENSG00000184330   15.8588        18.8164   2.12028   8.87449 7.02545e-19
## ...                   ...            ...       ...       ...         ...
## ENSG00000158290  1037.178       0.463707  0.234655   1.97612   0.0481408
## ENSG00000164168   343.789       0.455660  0.229441   1.98596   0.0470383
## ENSG00000116001   856.906       0.448834  0.229966   1.95174   0.0509686
## ENSG00000131269   245.552      -0.411551  0.191178  -2.15271   0.0313418
## ENSG00000275052  1251.051       0.409573  0.205924   1.98895   0.0467066
##                        padj
##                   <numeric>
## ENSG00000256618 2.48881e-19
## ENSG00000238554 1.33582e-14
## ENSG00000278996 6.23534e-09
## ENSG00000107807 1.28624e-15
## ENSG00000184330 1.73066e-16
## ...                     ...
## ENSG00000158290   0.0938363
## ENSG00000164168   0.0922192
## ENSG00000116001   0.0978497
## ENSG00000131269   0.0682218
## ENSG00000275052   0.0917858

Apply the same methods to get the comparison result between the other pairs:

  • HER2 vs Normal
  • NTNBC vs Normal
# HER2 VS NORMAL
res_HN <- results(dds_p, contrast = c('Condition', 'HER2', 'H'))

resHN_ord_log2 <- res_HN[order(abs(res_HN$log2FoldChange), decreasing = TRUE),] # sort the genes according to descending log2FoldChange
resHN_ord_log2 <- resHN_ord_log2[!is.na(resHN_ord_log2$padj), ] # exclude gene with padj as NA
resHN_ord_log2 <- resHN_ord_log2[resHN_ord_log2$padj < 0.1, ]
dim(resHN_ord_log2)
## [1] 15032     6
dim(res_HN)
## [1] 42617     6
# NON-TNBC VS NORMAL
res_NN <- results(dds_p, contrast = c('Condition', 'NTNBC', 'H'))

resNN_ord_log2 <- res_NN[order(abs(res_NN$log2FoldChange), decreasing = TRUE),] # sort the genes according to descending log2FoldChange
resNN_ord_log2 <- resNN_ord_log2[!is.na(resNN_ord_log2$padj), ] # exclude gene with padj as NA
resNN_ord_log2 <- resNN_ord_log2[resNN_ord_log2$padj < 0.1, ]
dim(resNN_ord_log2)
## [1] 15146     6
dim(res_NN)
## [1] 42617     6
hist(res_TN$pvalue)

use <- res_TN$baseMean > metadata(res_TN)$filterThreshold
h1 <- hist(res_TN$pvalue[!use], breaks=0:50/50, plot=FALSE)
h2 <- hist(res_TN$pvalue[use], breaks=0:50/50, plot=FALSE)
colori <- c(`do not pass`="khaki", `pass`="powderblue")

barplot(height = rbind(h1$counts, h2$counts), beside = FALSE,
        col = colori, space = 0, main = "", ylab="frequency", xlab = 'p-value')
text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)),
     adj = c(0.5,1.7), xpd=NA)
legend("topright", fill=rev(colori), legend=rev(names(colori)))

W <- res_TN$stat
maxCooks <- apply(assays(dds_p)[["cooks"]],1,max)
idx <- !is.na(W)
plot(rank(W[idx]), maxCooks[idx], xlab="rank of Wald statistic", 
     ylab="maximum Cook's distance per gene",
     ylim=c(0,5), cex=.4, col=rgb(0,0,0,.3))
m <- ncol(dds)
p <- 3
abline(h=qf(.99, p, m - p))

summary(res_TN)
## 
## out of 42617 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 15620, 37%
## LFC < 0 (down)     : 4423, 10%
## outliers [1]       : 302, 0.71%
## low counts [2]     : 4132, 9.7%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
summary(res_HN)
## 
## out of 42617 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 10119, 24%
## LFC < 0 (down)     : 4913, 12%
## outliers [1]       : 302, 0.71%
## low counts [2]     : 5784, 14%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
summary(res_NN)
## 
## out of 42617 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 10846, 25%
## LFC < 0 (down)     : 4300, 10%
## outliers [1]       : 302, 0.71%
## low counts [2]     : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Find common genes in the above 3 tables, i.e. genes that are differentially expressed in all 3 types of cancers:

gene_com <- intersect(intersect(rownames(resHN_ord_log2), rownames(resTN_ord_log2)), rownames(resNN_ord_log2))

# plot the normalized counts of some sample common genes
par(mfrow = c(2,2))

p1 <- plotCounts(dds_p, gene = gene_com[1], intgroup = 'Condition' )
p2 <-plotCounts(dds_p, gene = gene_com[2], intgroup = 'Condition' )
p3 <-plotCounts(dds_p, gene = gene_com[3], intgroup = 'Condition' )
p4 <-plotCounts(dds_p, gene = gene_com[4], intgroup = 'Condition' )

Plot the difference in log2FoldChange in different cancer types:

temp_TN <- as.matrix(resTN_ord_log2[rownames(resTN_ord_log2) %in% gene_com,])
temp_HN <- as.matrix(resHN_ord_log2[rownames(resHN_ord_log2) %in% gene_com,])
temp_NN <- as.matrix(resNN_ord_log2[rownames(resNN_ord_log2) %in% gene_com,])

colnames(temp_TN) <- c('baseMean_TN', 'log2FoldChange_TN', 'lfcSE_TN', 'stat_TN', 'pvalue_TN', 'padj_TN')
colnames(temp_HN) <- c('baseMean_HN', 'log2FoldChange_HN', 'lfcSE_HN', 'stat_HN', 'pvalue_HN', 'padj_HN')
colnames(temp_NN) <- c('baseMean_NN', 'log2FoldChange_NN', 'lfcSE_NN', 'stat_NN', 'pvalue_NN', 'padj_NN')

temp_1 <- merge(temp_TN, temp_HN, by = 'row.names')
rownames(temp_1) <- temp_1$Row.names

res_common_H <- merge(temp_NN, temp_1, by = 'row.names')
res_common_H <- res_common_H[, colnames(res_common_H) != 'Row.names.y']
head(res_common_H)
##         Row.names baseMean_NN log2FoldChange_NN  lfcSE_NN   stat_NN
## 1 ENSG00000000457   230.54816         1.7789001 0.2985928  5.957613
## 2 ENSG00000000460   139.19369         1.7555754 0.3478203  5.047364
## 3 ENSG00000001629  1071.61551         1.2624425 0.2731820  4.621251
## 4 ENSG00000001630    36.40235        -1.9836223 0.4497786 -4.410219
## 5 ENSG00000001631   106.07561         0.7866407 0.3355306  2.344468
## 6 ENSG00000002079    14.05838         2.6215896 0.8053547  3.255199
##      pvalue_NN      padj_NN baseMean_TN log2FoldChange_TN  lfcSE_TN   stat_TN
## 1 2.559489e-09 9.899888e-08   230.54816          1.458280 0.3090780  4.718161
## 2 4.479480e-07 1.022380e-05   139.19369          1.803008 0.3589597  5.022871
## 3 3.814326e-06 6.686130e-05  1071.61551          1.258643 0.2821740  4.460522
## 4 1.032660e-05 1.573533e-04    36.40235         -2.159271 0.4706063 -4.588275
## 5 1.905425e-02 6.305345e-02   106.07561          0.695039 0.3474565  2.000362
## 6 1.133131e-03 7.410884e-03    14.05838          4.061546 0.8119662  5.002111
##      pvalue_TN      padj_TN baseMean_HN log2FoldChange_HN  lfcSE_HN   stat_HN
## 1 2.379857e-06 3.883337e-05   230.54816          2.416723 0.3079559  7.847627
## 2 5.090464e-07 1.041078e-05   139.19369          2.873196 0.3571670  8.044407
## 3 8.176022e-06 1.093085e-04  1071.61551          1.489482 0.2822953  5.276327
## 4 4.469231e-06 6.665963e-05    36.40235         -1.735948 0.4707148 -3.687898
## 5 4.546114e-02 9.002867e-02   106.07561          1.148901 0.3474329  3.306828
## 6 5.670581e-07 1.141380e-05    14.05838          2.150937 0.8469046  2.539764
##      pvalue_HN      padj_HN
## 1 4.239833e-15 4.086684e-13
## 2 8.666433e-16 9.071446e-14
## 3 1.317991e-07 2.427178e-06
## 4 2.261139e-04 1.472663e-03
## 5 9.435881e-04 4.823708e-03
## 6 1.109274e-02 3.546241e-02
# plot the different log2FoldChange per gene
res_plot <- left_join(res_common_H, gene_mapping, by = c("Row.names" = "Gene.ID"))

# reshape res_plot for plotting
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
res_plot <- melt(res_plot[c(3, 9, 15, 21)], id.vars="Gene.Name")
res_plot$Gene.Name[is.null(res_plot$Gene.Name)] <- 'na'
res_plot<-res_plot[order(res_plot$Gene.Name),]
dim(res_common_H)
## [1] 8549   19
sum(is.na(res_plot$variable))
## [1] 0

Common genes with padj < 0.1 and LFC > 2

# get common DEGs with abs(LFC) > 2
hn_lfc <- rownames(res_HN)[rownames(res_HN) %in% gene_com & res_HN$padj < 0.1 & res_HN$log2FoldChange > 2]
tn_lfc <- rownames(res_TN)[rownames(res_TN) %in% gene_com & res_TN$padj < 0.1 & res_TN$log2FoldChange > 2]
nn_lfc <- rownames(res_NN)[rownames(res_NN) %in% gene_com & res_NN$padj < 0.1 & res_NN$log2FoldChange > 2]

deseq_lfc <- Reduce(intersect, list(hn_lfc, tn_lfc, nn_lfc))
length(deseq_lfc)
## [1] 4035
#ggplot(res_plot[83:382,], aes(x = Gene.Name, y = value, color = as.character(variable), group = variable) ) + geom_point(alpha = 0.9) + 
#    theme(axis.text.x = element_text(angle = 90, size = 8), legend.position = 'bottom')

For common genes, the log2FoldChange in general follows the same trend when compared against healthy samples.

2.2 Unique DEGs found per cancer type

deg_her2 <- rownames(resHN_ord_log2)
deg_tnbc <- rownames(resTN_ord_log2)
deg_ntnbc <- rownames(resNN_ord_log2)

length(deg_her2)
## [1] 15032
length(deg_tnbc)
## [1] 20043
length(deg_ntnbc)
## [1] 15146
head(deg_her2)
## [1] "ENSG00000256618" "ENSG00000184330" "ENSG00000107807" "ENSG00000238554"
## [5] "ENSG00000200156" "ENSG00000278996"

2.2.1 DEGs unique in HER2

Find DEGs that only have differential expression in HER2, but not in TNBC or NTNBC.

deg_her2_uniq <- c()

for (gene in deg_her2) {
  if (!(gene %in% deg_tnbc) & !(gene %in% deg_ntnbc)){
    #gene_name <- gene_mapping$Gene.Name[gene_mapping$Gene.ID == gene]
    deg_her2_uniq <- c(deg_her2_uniq, gene)
    # deg_her2_uniq <- c(deg_her2_uniq, gene_name)

  }
}
length(deg_her2_uniq)
## [1] 2829
head(deg_her2_uniq)
## [1] "ENSG00000187268" "ENSG00000250981" "ENSG00000201700" "ENSG00000143556"
## [5] "ENSG00000143194" "ENSG00000183654"

Filter the 2829 DEGs so that padj < 0.1 and abs(log2FoldChang) > 2

deg_her2_uniq_res <- res_HN[(rownames(res_HN) %in% deg_her2_uniq) & (res_HN$padj < 0.1) & (abs(res_HN$log2FoldChange) > 2), ]
deg_her2_uniq_res
## log2 fold change (MLE): Condition HER2 vs H 
## Wald test p-value: Condition HER2 vs H 
## DataFrame with 1572 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat      pvalue
##                 <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSG00000005421   6.74571        2.50046  1.087676   2.29890 2.15104e-02
## ENSG00000011258 525.11625        2.30329  0.552918   4.16570 3.10392e-05
## ENSG00000012171 201.82423       -3.87567  0.903037  -4.29181 1.77221e-05
## ENSG00000015520  23.97848       -2.39966  0.666343  -3.60123 3.16713e-04
## ENSG00000018625  57.50529       -3.34748  0.776105  -4.31317 1.60930e-05
## ...                   ...            ...       ...       ...         ...
## ENSG00000285653   3.60513        3.03447  1.141604   2.65808  0.00785881
## ENSG00000285679   3.04704        2.59356  1.064615   2.43615  0.01484445
## ENSG00000285774   5.62329        2.56804  0.864342   2.97110  0.00296738
## ENSG00000285822   5.31911        7.23849  3.186828   2.27138  0.02312415
## ENSG00000285836   4.62876        2.46326  0.961903   2.56082  0.01044246
##                        padj
##                   <numeric>
## ENSG00000005421 0.060007224
## ENSG00000011258 0.000278803
## ENSG00000012171 0.000173412
## ENSG00000015520 0.001949756
## ENSG00000018625 0.000159753
## ...                     ...
## ENSG00000285653   0.0269898
## ENSG00000285679   0.0446907
## ENSG00000285774   0.0122821
## ENSG00000285822   0.0634720
## ENSG00000285836   0.0337976

2.2.2 DEGs unique in TNCB

Find DEGs that only have differential expression in TNBC, but not in HER2 or NTNBC.

deg_tnbc_uniq <- c()

for (gene in deg_tnbc) {
  if (!(gene %in% deg_her2) & !(gene %in% deg_ntnbc)){
    #gene_name <- gene_mapping$Gene.Name[gene_mapping$Gene.ID == gene]
    #deg_tnbc_uniq <- c(deg_tnbc_uniq, gene_name)
    deg_tnbc_uniq <- c(deg_tnbc_uniq, gene)
  }
}
length(deg_tnbc_uniq)
## [1] 5388

Filter the 5388 DEGs so that padj < 0.1 and abs(log2FoldChang) > 2

deg_tnbc_uniq_res <- res_TN[(rownames(res_TN) %in% deg_tnbc_uniq) & (res_TN$padj < 0.1) & (abs(res_TN$log2FoldChange) > 2), ]
deg_tnbc_uniq_res
## log2 fold change (MLE): Condition TNBC vs H 
## Wald test p-value: Condition TNBC vs H 
## DataFrame with 4227 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat      pvalue
##                 <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSG00000002745   4.44906        2.55036  0.976344   2.61216 0.008997267
## ENSG00000005073   8.79492        4.07119  1.384853   2.93980 0.003284285
## ENSG00000006059   2.56216        3.30263  1.172900   2.81579 0.004865813
## ENSG00000006282 257.68069       -2.07077  0.697565  -2.96857 0.002991867
## ENSG00000006377   7.24885        4.29806  1.206065   3.56370 0.000365661
## ...                   ...            ...       ...       ...         ...
## ENSG00000285969   2.14834        4.64555   1.45840   3.18538  0.00144566
## ENSG00000285971   3.11366        4.41082   1.46667   3.00737  0.00263519
## ENSG00000285972   1.37696        4.05401   1.81564   2.23283  0.02555994
## ENSG00000285977   1.07755        4.40021   1.62621   2.70581  0.00681386
## ENSG00000285984   2.16402        4.44710   1.70179   2.61319  0.00897020
##                       padj
##                  <numeric>
## ENSG00000002745 0.02702082
## ENSG00000005073 0.01254296
## ENSG00000006059 0.01696070
## ENSG00000006282 0.01166889
## ENSG00000006377 0.00228324
## ...                    ...
## ENSG00000285969 0.00659888
## ENSG00000285971 0.01056817
## ENSG00000285972 0.05852370
## ENSG00000285977 0.02183398
## ENSG00000285984 0.02697130

Find DEGs that only have differential expression in NTNBC, but not in HER2 or TNBC.

deg_ntnbc_uniq <- c()

for (gene in deg_ntnbc) {
  if (!(gene %in% deg_her2) & !(gene %in% deg_tnbc)){
    #gene_name <- gene_mapping$Gene.Name[gene_mapping$Gene.ID == gene]
    #deg_ntnbc_uniq <- c(deg_ntnbc_uniq, gene_name)
    deg_ntnbc_uniq <- c(deg_ntnbc_uniq, gene)
  }
}
length(deg_ntnbc_uniq)
## [1] 1455

Filter the 1455 DEGs so that padj < 0.1 and abs(log2FoldChang) > 2

deg_ntnbc_uniq_res <- res_NN[(rownames(res_NN) %in% deg_ntnbc_uniq) & (res_NN$padj < 0.1) & (abs(res_NN$log2FoldChange) > 2), ]
deg_ntnbc_uniq_res
## log2 fold change (MLE): Condition NTNBC vs H 
## Wald test p-value: Condition NTNBC vs H 
## DataFrame with 498 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat      pvalue
##                 <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSG00000005469  346.4683        2.10581  0.643181   3.27406 1.06013e-03
## ENSG00000005513   35.1593       -5.38007  1.613803  -3.33378 8.56735e-04
## ENSG00000006747  103.3118        2.90883  0.629176   4.62323 3.77806e-06
## ENSG00000007062  763.2914       -5.57669  0.852645  -6.54046 6.13308e-11
## ENSG00000008300  121.9057        2.25069  0.594153   3.78807 1.51821e-04
## ...                   ...            ...       ...       ...         ...
## ENSG00000285587   2.30098        3.54377  1.444751   2.45286  0.01417263
## ENSG00000285632   3.40455        2.55228  0.999384   2.55385  0.01065391
## ENSG00000285711   1.54134        5.10511  2.235400   2.28376  0.02238586
## ENSG00000285755   3.36367        3.98576  1.479678   2.69367  0.00706709
## ENSG00000285878   1.55174        3.53415  1.529404   2.31080  0.02084383
##                        padj
##                   <numeric>
## ENSG00000005469 7.04560e-03
## ENSG00000005513 5.93918e-03
## ENSG00000006747 6.62530e-05
## ENSG00000007062 3.38801e-09
## ENSG00000008300 1.49895e-03
## ...                     ...
## ENSG00000285587   0.0508362
## ENSG00000285632   0.0411032
## ENSG00000285711   0.0709822
## ENSG00000285755   0.0303239
## ENSG00000285878   0.0673166

Shrunken log2FoldChange:

#resultsNames(dds_p)
library(apeglm)
resLFC_HN <- lfcShrink(dds_p, coef = 'Condition_HER2_vs_H', type = 'apeglm')
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resLFC_NN <- lfcShrink(dds_p, coef = 'Condition_NTNBC_vs_H', type = 'apeglm')
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resLFC_TN <- lfcShrink(dds_p, coef = 'Condition_TNBC_vs_H', type = 'apeglm')
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

MA-plot of the 3 comparison groups:

# HN
par(mfrow = c(1,2))

p1 <- plotMA(res_HN, ylim = c(-4,4), main = 'Original LFC')
p2 <- plotMA(resLFC_HN, ylim = c(-4,4), main = 'Shrinked LFC')

# TN
par(mfrow = c(1,2))

p1 <- plotMA(res_TN, ylim = c(-4,4), main = 'Original LFC')
p2 <- plotMA(resLFC_TN, ylim = c(-4,4), main = 'Shrinked LFC')

# NN
par(mfrow = c(1,2))

p1 <- plotMA(res_NN, ylim = c(-4,4),  main = 'Original LFC')
p2 <- plotMA(resLFC_NN, ylim = c(-4,4),  main = 'Shrinked LFC')

3. Extract significant genes as features for sample clustering (DESeq2)

Use genes found in section 2 as features to perform sample clustering:

# consolidate the genes
sig_gene <- c(rownames(deg_ntnbc_uniq_res), rownames(deg_tnbc_uniq_res), rownames(deg_her2_uniq_res))
# convert the gene name to gene id
sig_gene <- gene_mapping$Gene.ID[gene_mapping$Gene.Name %in% sig_gene]

# consolidate the gene ids again and remove any duplicate
sig_gene <- c(sig_gene, res_common_H$Row.names)
sig_gene <- unique(sig_gene)
length(sig_gene)
## [1] 8549

Filter the original df to include only the 8549 genes:

# get the rlog transformed gene counts
res_rld <- data.frame(assay(rlog(dds_p)))
# filter by the 195 genes
df_sub <- res_rld[rownames(res_rld) %in% sig_gene,]
head(df_sub)
##                 SRR1027182 SRR1027186 SRR1027185 SRR1027177 SRR1027181
## ENSG00000000457   7.675631   8.739872   8.073235   7.545369   7.790568
## ENSG00000000460   6.669396   7.931435   7.339798   7.254117   6.606724
## ENSG00000001629   9.971772  10.010057  10.696888  10.470821   9.930442
## ENSG00000001630   4.958660   5.358791   5.120947   4.917697   4.265257
## ENSG00000001631   6.950015   6.781347   7.586488   6.648537   6.557669
## ENSG00000002079   3.586119   3.201466   3.030766   2.627879   3.682450
##                 SRR1027175 SRR1027180 SRR1027184 SRR1027190 SRR1027188
## ENSG00000000457   7.771666   7.982595   8.242751   6.671099   6.877995
## ENSG00000000460   6.677899   6.772192   8.047991   5.711017   5.766480
## ENSG00000001629  10.137478  10.050901  10.327020   9.432318   8.980171
## ENSG00000001630   4.498984   4.157702   4.123406   5.900362   6.096478
## ENSG00000001631   6.683149   6.698314   7.058255   6.422217   5.982455
## ENSG00000002079   3.452991   3.816225   2.840214   2.278830   1.563295
##                 SRR1027176 SRR1027189 SRR1027174 SRR1027179 SRR1027178
## ENSG00000000457   7.512042   6.444943   7.212550   7.989886   7.808228
## ENSG00000000460   7.234055   5.850033   6.600967   6.751960   6.952811
## ENSG00000001629  10.016222   9.322862  10.044661   9.940110  10.054160
## ENSG00000001630   5.023281   6.182751   3.974624   4.492462   5.135600
## ENSG00000001631   6.772411   6.143260   6.575057   6.711344   6.676041
## ENSG00000002079   3.559068   2.406595   3.461142   3.219383   3.088671
##                 SRR1027187 SRR1027171 SRR1027173 SRR1027183
## ENSG00000000457   8.133732   7.593573   7.775204   7.895472
## ENSG00000000460   7.323078   7.140027   6.627667   7.280963
## ENSG00000001629   9.961946  10.166826  10.048988  10.037315
## ENSG00000001630   4.336180   4.679490   4.606548   4.921191
## ENSG00000001631   6.420074   6.742065   6.447810   6.462251
## ENSG00000002079   3.787569   4.821001   5.144111   2.589992

As the 8549 genes might have colinearity (e.g. coexpression of genes), will perform PCA and use PCs as features to reduce colinearity.

rld_pca <- prcomp(t(df_sub), scale = T, center = T)
summary(rld_pca)
## Importance of components:
##                            PC1      PC2      PC3      PC4      PC5     PC6
## Standard deviation     66.7097 26.66731 22.45152 21.37077 19.12686 15.9356
## Proportion of Variance  0.5205  0.08318  0.05896  0.05342  0.04279  0.0297
## Cumulative Proportion   0.5205  0.60373  0.66270  0.71612  0.75891  0.7886
##                             PC7      PC8      PC9    PC10     PC11     PC12
## Standard deviation     15.24610 14.43707 13.81644 13.1741 12.87425 12.53418
## Proportion of Variance  0.02719  0.02438  0.02233  0.0203  0.01939  0.01838
## Cumulative Proportion   0.81581  0.84019  0.86252  0.8828  0.90221  0.92058
##                            PC13     PC14     PC15     PC16    PC17    PC18
## Standard deviation     12.33131 11.57145 11.27389 10.98224 8.73020 8.30993
## Proportion of Variance  0.01779  0.01566  0.01487  0.01411 0.00892 0.00808
## Cumulative Proportion   0.93837  0.95403  0.96890  0.98301 0.99192 1.00000
##                             PC19
## Standard deviation     7.332e-14
## Proportion of Variance 0.000e+00
## Cumulative Proportion  1.000e+00

Choose the first 14 PCs with 95% of variance coverage as features for clustering

post_pca = rld_pca$x[,1:14]

Hierachical clustering

set.seed(127)
#comput the distance bewteen the samples using euclidean distance
dist <- dist(post_pca, method = 'euclidean')

# hierachical clustering using Ward's criterion
hclust_ward <- hclust(dist, method = 'ward.D2')
fviz_dend(hclust_ward, repel = TRUE)

# select the number of clusters
fviz_nbclust(post_pca, FUNcluster = hcut, method = 'silhouette')

# K = 3
# cut the dendrogram with k = 3
cut_ward <- cutree(hclust_ward, k = 2)

# plot the dengrogram with 3 clusters
fviz_dend(hclust_ward, k = 2, rect = TRUE, color_labels_by_k = TRUE, type = "rectangle", show_labels = TRUE)

# plot the cluster in PCA
fviz_cluster(object = list(data = post_pca, cluster = cut_ward), stand = FALSE, geom = c("point", "text"), repel = T, main = 'HCL clustering')

Plot the original subtypes on PCA plot for comparison:

fviz_pca_ind(rld_pca, geom = c("point", "text"), labelsize = 3, habillage = meta$Condition, title = 'PCA - Categorized by cancer subtype labels', repel = T)

The features fail to recognize the difference among the cancer subtypes. Cluster 1 contains samples of HER2, TNCB and NTNBC, whereas Cluster 2 contains only Healthy samples.

This is not surprising as the DEGs were identified by comparing cancer samples against healthy samples, hence, have good performance in differentiating healthy samples from cancer ones, but not so good when differentiate among different cancer subtypes.

4. Differential Expression Analysis using Limma-Voom

Setup

library(edgeR)
## Loading required package: limma
## 
## Attaching package: 'limma'
## The following object is masked from 'package:DESeq2':
## 
##     plotMA
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
# limma is loaded as dependency
# create DGEList object
d1 <- DGEList(df)
# calculate normalization factors
d0 <- calcNormFactors(d1)
d0
## An object of class "DGEList"
## $counts
##                 SRR1027182 SRR1027186 SRR1027185 SRR1027177 SRR1027181
## ENSG00000000003         55         97        321        507        116
## ENSG00000000005          2          2          2          1          3
## ENSG00000000419        179        162        224       1219        561
## ENSG00000000457        147        393        257        298        270
## ENSG00000000460         68        225        158        287        104
##                 SRR1027175 SRR1027180 SRR1027184 SRR1027190 SRR1027188
## ENSG00000000003        377        167        435        975        933
## ENSG00000000005          2         18          1         24         22
## ENSG00000000419        403        183        921        392        398
## ENSG00000000457        189        267        389        114        128
## ENSG00000000460         80        101        399         53         50
##                 SRR1027176 SRR1027189 SRR1027174 SRR1027179 SRR1027178
## ENSG00000000003        142        972        253         98        207
## ENSG00000000005        100         41          3          9         11
## ENSG00000000419        311        344        252        433        328
## ENSG00000000457        141         86        104        310        243
## ENSG00000000460        138         63         73        114        132
##                 SRR1027187 SRR1027171 SRR1027173 SRR1027183
## ENSG00000000003        229        235        257        257
## ENSG00000000005          0          7         11          2
## ENSG00000000419        130        516        340        242
## ENSG00000000457        128        247        240        208
## ENSG00000000460         73        202         96        144
## 58730 more rows ...
## 
## $samples
##            group lib.size norm.factors
## SRR1027182     1  5218431    0.9582419
## SRR1027186     1  6326462    0.8219818
## SRR1027185     1  6290938    1.0214663
## SRR1027177     1 11938349    0.8634171
## SRR1027181     1  7849877    1.1064333
## 14 more rows ...

Extract group info from meta in the same order as sample sequence in df

meta <- meta[match(colnames(df), meta$Run),]
group <- as.factor(meta$Condition)

Filter low-expressed genes

keep.exprs <- filterByExpr(d0, group=group)
d <- d0[keep.exprs,, keep.lib.sizes=FALSE]
dim(d)
## [1] 21023    19
# plot the distribution of genes before/after filtration
cpm <- cpm(d0)
lcpm <- cpm(d0, log=TRUE)

L <- mean(d0$samples$lib.size) * 1e-6
M <- median(d0$samples$lib.size) * 1e-6

lcpm.cutoff <- log2(10/M + 2/L)
library(RColorBrewer)
nsamples <- ncol(d0)
col <- colorRampPalette(brewer.pal(8, "Paired"))(19)
par(mfrow=c(1,2))
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="A. Raw data", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
  den <- density(lcpm[,i])
  lines(den$x, den$y, col=col[i], lwd=2)
}
# legend("topright", samplenames, text.col=col, bty="n")
lcpm <- cpm(d, log=TRUE)
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="B. Filtered data", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
  den <- density(lcpm[,i])
  lines(den$x, den$y, col=col[i], lwd=2)
}

#legend("topright", samplenames, text.col=col, bty="n")

Unsupervised clustering of samples:

lcpm <- cpm(d, log=TRUE)

col.group <- group
levels(col.group) <- brewer.pal(nlevels(col.group), "Set1")
col.group <- as.numeric(col.group)
plotMDS(lcpm, labels=group, col=col.group)
title(main="Sample groups")

Specify the model to be fitted before using voom, as voom uses variances of the model residuals:

mm <- model.matrix(~ 0 + group)
mm
##    groupH groupHER2 groupNTNBC groupTNBC
## 1       0         0          1         0
## 2       0         1          0         0
## 3       0         1          0         0
## 4       0         0          1         0
## 5       0         0          1         0
## 6       0         0          0         1
## 7       0         0          1         0
## 8       0         1          0         0
## 9       1         0          0         0
## 10      1         0          0         0
## 11      0         0          0         1
## 12      1         0          0         0
## 13      0         0          0         1
## 14      0         0          1         0
## 15      0         0          1         0
## 16      0         1          0         0
## 17      0         0          0         1
## 18      0         0          0         1
## 19      0         1          0         0
## attr(,"assign")
## [1] 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
# model where each coefficient corresponds to a group mean
y <- voom(d, mm, plot=T)

y
## An object of class "EList"
## $targets
##            group lib.size norm.factors
## SRR1027182     1  4976088    0.9582419
## SRR1027186     1  5175924    0.8219818
## SRR1027185     1  6391829    1.0214663
## SRR1027177     1 10271500    0.8634171
## SRR1027181     1  8630457    1.1064333
## 14 more rows ...
## 
## $E
##                 SRR1027182 SRR1027186 SRR1027185 SRR1027177 SRR1027181
## ENSG00000000003  3.4794035   4.235514   5.652446   5.626689   3.754749
## ENSG00000000005 -0.9930842  -1.049889  -1.354301  -2.775613  -1.302082
## ENSG00000000419  5.1728277   4.972479   5.134343   6.891499   6.023705
## ENSG00000000457  4.8895588   6.248403   5.332200   4.861012   4.970048
## ENSG00000000460  3.7830198   5.445167   4.632110   4.806843   3.597922
##                 SRR1027175 SRR1027180 SRR1027184 SRR1027190 SRR1027188
## ENSG00000000003   6.006495   4.547798   5.791883   6.486268   6.646000
## ENSG00000000005  -1.231910   1.369234  -2.389683   1.170980   1.271347
## ENSG00000000419   6.102587   4.679417   6.873194   5.172819   5.417930
## ENSG00000000457   5.012216   5.223176   5.630834   3.395474   3.785119
## ENSG00000000460   3.777079   3.825117   5.667406   2.297737   2.437706
##                 SRR1027176 SRR1027189 SRR1027174 SRR1027179 SRR1027178
## ENSG00000000003   4.552505   6.496901  5.4690694  3.6173405  4.7548609
## ENSG00000000005   4.048738   1.946386 -0.7094176  0.2432162  0.5814553
## ENSG00000000419   5.680775   4.999707  5.4633670  5.7551768  5.4176430
## ENSG00000000457   4.542345   3.005975  4.1905866  5.2737381  4.9856714
## ENSG00000000460   4.511429   2.560031  3.6828998  3.8344925  4.1077419
##                 SRR1027187 SRR1027171 SRR1027173 SRR1027183
## ENSG00000000003   6.236275  4.4972091  4.7689844   5.366278
## ENSG00000000005  -2.606075 -0.4754836  0.2841178  -1.320222
## ENSG00000000419   5.421831  5.6302504  5.1720668   5.279691
## ENSG00000000457   5.399550  4.5689106  4.6704489   5.061753
## ENSG00000000460   4.593597  4.2794039  3.3530129   4.532775
## 21018 more rows ...
## 
## $weights
##          [,1]      [,2]     [,3]      [,4]      [,5]      [,6]      [,7]
## [1,] 2.665116 3.4449330 3.582378 3.3538089 3.2030003 3.3280929 3.0281307
## [2,] 0.710988 0.5127462 0.555782 0.9566262 0.8898671 0.9722804 0.8240998
## [3,] 3.513644 3.4855322 3.612227 3.8267638 3.7761743 3.5939101 3.7078005
## [4,] 3.167700 3.4846308 3.611566 3.6752517 3.5863225 3.0375768 3.4681485
## [5,] 2.457642 3.1659581 3.349798 3.1702793 3.0039508 2.5702972 2.8191282
##           [,8]     [,9]    [,10]     [,11]    [,12]    [,13]     [,14]
## [1,] 3.6869238 3.915124 3.906053 3.3559976 3.914611 3.306952 3.1363253
## [2,] 0.6022071 1.679017 1.582233 0.9860005 1.672285 0.961879 0.8636138
## [3,] 3.7115164 3.746078 3.682527 3.6120717 3.742037 3.580100 3.7516831
## [4,] 3.7110498 2.823625 2.668006 3.0688949 2.813226 3.012001 3.5426391
## [5,] 3.5037948 2.183988 2.054648 2.6036944 2.174986 2.545122 2.9331600
##          [,15]     [,16]    [,17]    [,18]     [,19]
## [1,] 3.0948440 2.9770661 3.690146 3.644642 3.5689387
## [2,] 0.8483016 0.4464249 1.235130 1.186212 0.5507552
## [3,] 3.7349055 3.0318457 3.822162 3.794827 3.5994469
## [4,] 3.5154785 3.0306246 3.516904 3.449000 3.5987884
## [5,] 2.8905395 2.6435873 3.137409 3.043663 3.3301958
## 21018 more rows ...
## 
## $design
##   groupH groupHER2 groupNTNBC groupTNBC
## 1      0         0          1         0
## 2      0         1          0         0
## 3      0         1          0         0
## 4      0         0          1         0
## 5      0         0          1         0
## 14 more rows ...
contr.matrix <- makeContrasts(
   HC = groupHER2 - groupH, 
   TC = groupTNBC - groupH, 
   NC = groupNTNBC - groupH, 
   levels = colnames(mm))
contr.matrix
##             Contrasts
## Levels       HC TC NC
##   groupH     -1 -1 -1
##   groupHER2   1  0  0
##   groupNTNBC  0  0  1
##   groupTNBC   0  1  0
# HT = groupHER2 - groupTNBC,
# HN = groupHER2 - groupNTNBC,
# TN = groupTNBC - groupNTNBC,
fit <- lmFit(y, mm)
vfit <- contrasts.fit(fit, contrasts=contr.matrix)
efit <- eBayes(vfit)
#efit <- eBayes(fit)
plotSA(efit, main="Final model: Mean-variance trend")

Examine the number of DE genes

dt <- decideTests(efit, p.value = 0.1, lfc = 2)
summary(dt)
##           HC    TC    NC
## Down    2619  1978  1921
## NotSig 15533 15721 16077
## Up      2871  3324  3025
dt
## TestResults matrix
##                  Contrasts
##                   HC TC NC
##   ENSG00000000003  0  0 -1
##   ENSG00000000005 -1  0  0
##   ENSG00000000419  0  0  0
##   ENSG00000000457  1  0  0
##   ENSG00000000460  1  0  0
## 21018 more rows ...
de.common <- rownames(dt)[which(dt[,1]!=0 & dt[,2]!=0 & dt[,3]!=0)]
length(de.common)
## [1] 2874
# graph of all DEGs
vennDiagram(dt[,1:3], circle.col = c('turquoise', 'salmon', 'bisque4'))
title(main="All DEGs")

# graph of all up-regulated DEGs
dt_up <- dt[dt[,1] == 1 | dt[,2]==1 | dt[,3]==1,]
vennDiagram(dt_up[,1:3], circle.col = c('turquoise', 'salmon', 'bisque3'))
title(main="Up-regulated DEGs")

# graph of all down-regulated DEGs
dt_down <- dt[dt[,1] == -1 | dt[,2]== -1 | dt[,3]== -1,]
vennDiagram(dt_down[,1:3], circle.col = c('turquoise', 'salmon', 'bisque3'))
title(main="Down-regulated DEGs")

5. Sample Clustering using DEGs identified (by Limma-Voom)

Filter the original df to include only the de.common genes:

# get the rlog transformed gene counts
res_rld_voom <- data.frame(rlog(d$counts))
# filter by the 195 genes
df_sub_voom <- res_rld_voom[rownames(res_rld_voom) %in% de.common,]
head(df_sub_voom)
##                 SRR1027182 SRR1027186 SRR1027185 SRR1027177 SRR1027181
## ENSG00000001630   4.958247   5.343856   5.114406   4.917008   4.300493
## ENSG00000002079   3.569527   3.234971   3.087680   2.749018   3.654611
## ENSG00000003096   5.175175   5.353186   4.684327   5.903299   5.913090
## ENSG00000004468   6.535455   6.608274   8.371350   5.709817   5.211497
## ENSG00000004776   3.258560   3.881058   2.996423   2.825765   3.165029
## ENSG00000004799   8.515097   9.356760   9.066982   8.384969   8.101930
##                 SRR1027175 SRR1027180 SRR1027184 SRR1027190 SRR1027188
## ENSG00000001630   4.522198   4.201337   4.168541   5.861037   6.054485
## ENSG00000002079   3.452448   3.776555   2.927799   2.468231   1.942604
## ENSG00000003096   7.045040   5.529632   6.560616   7.779031   7.846030
## ENSG00000004468   6.154600   5.568917   7.581691   4.620612   4.385451
## ENSG00000004776   3.518001   5.235172   2.482809   6.911212   6.751311
## ENSG00000004799   8.899806   9.564152   7.154566  11.896385  11.374678
##                 SRR1027176 SRR1027189 SRR1027174 SRR1027179 SRR1027178
## ENSG00000001630   5.019956   6.138668   4.025314   4.516716   5.129874
## ENSG00000002079   3.545421   2.568839   3.458071   3.249562   3.137770
## ENSG00000003096   6.065843   7.199163   5.600636   6.258848   5.181567
## ENSG00000004468   7.251702   4.258206   6.512728   5.594417   5.135410
## ENSG00000004776   4.966721   6.434612   3.415011   3.838003   3.012564
## ENSG00000004799   7.779195  11.375002   8.473789   7.950878   8.167929
##                 SRR1027187 SRR1027171 SRR1027173 SRR1027183
## ENSG00000001630   4.370189   4.697568   4.626413   4.925131
## ENSG00000002079   3.749386   4.701181   5.000301   2.721378
## ENSG00000003096   6.229182   6.016643   5.417350   6.107114
## ENSG00000004468   5.620001   6.711646   6.103910   7.673700
## ENSG00000004776   3.996636   4.080438   3.240130   3.641810
## ENSG00000004799  10.122890   8.025655   8.220681   8.409339

As the 2875 genes might have colinearity (e.g. coexpression of genes), will perform PCA and use PCs as features to reduce colinearity.

rld_pca_voom <- prcomp(t(df_sub_voom), scale = T, center = T)
summary(rld_pca_voom)
## Importance of components:
##                            PC1      PC2      PC3      PC4      PC5     PC6
## Standard deviation     41.3894 15.29290 12.57930 11.68686 10.04593 8.46917
## Proportion of Variance  0.5961  0.08138  0.05506  0.04752  0.03512 0.02496
## Cumulative Proportion   0.5961  0.67744  0.73250  0.78002  0.81514 0.84009
##                            PC7     PC8     PC9    PC10    PC11    PC12    PC13
## Standard deviation     7.93545 7.28620 6.91457 6.79487 6.45732 6.35105 5.98040
## Proportion of Variance 0.02191 0.01847 0.01664 0.01606 0.01451 0.01403 0.01244
## Cumulative Proportion  0.86200 0.88048 0.89711 0.91318 0.92768 0.94172 0.95416
##                          PC14    PC15    PC16    PC17    PC18     PC19
## Standard deviation     5.9206 5.65906 5.36315 4.32398 4.14683 2.74e-14
## Proportion of Variance 0.0122 0.01114 0.01001 0.00651 0.00598 0.00e+00
## Cumulative Proportion  0.9664 0.97750 0.98751 0.99402 1.00000 1.00e+00

Choose the first 13 PCs with 95% of variance coverage as features for clustering

post_pca_voom = rld_pca_voom$x[,1:13]

Hierachical clustering

set.seed(127)
#comput the distance bewteen the samples using euclidean distance
dist <- dist(post_pca_voom, method = 'euclidean')

# hierachical clustering using Ward's criterion
hclust_ward <- hclust(dist, method = 'mcquitty')
fviz_dend(hclust_ward, repel = TRUE)

# select the number of clusters
fviz_nbclust(post_pca_voom, FUNcluster = hcut, method = 'silhouette')

# K = 2
# cut the dendrogram with k = 2
cut_ward <- cutree(hclust_ward, k = 2)

# plot the dengrogram with 3 clusters
fviz_dend(hclust_ward, k = 2, rect = TRUE, color_labels_by_k = TRUE, type = "rectangle", show_labels = TRUE)

# plot the cluster in PCA
fviz_cluster(object = list(data = post_pca_voom, cluster = cut_ward), stand = FALSE, geom = c("point", "text"), repel = T, main = 'HCL clustering')

Plot the original subtypes on PCA plot for comparison:

fviz_pca_ind(rld_pca_voom, geom = c("point", "text"), labelsize = 3, habillage = meta$Condition, title = 'PCA - Categorized by cancer subtype labels', repel = T)

6. Overlapping DEGs from DESeq2 and Limma-Voom

# common DEGs among 3 subtypes:
length(Reduce(intersect, list(de.common, deseq_lfc)))
## [1] 1674
#HER2 -voom with lfc filter
her2_v <- rownames(dt)[dt[,1] != 0 & dt[,2] == 0 & dt[,3] == 0]
length(her2_v)
## [1] 1330
# HER2 -DESEq2 with lfc filter
length(intersect(rownames(deg_her2_uniq_res), her2_v)) # overlap with LFC filter of 2
## [1] 311
#TNBC
tnbc_v <- rownames(dt)[dt[,2] != 0 & dt[,1] == 0 & dt[,3] == 0]
length(tnbc_v)
## [1] 826
length(intersect(rownames(deg_tnbc_uniq_res), tnbc_v))
## [1] 203
#NTNBC
ntnbc_v <- rownames(dt)[dt[,3] != 0 & dt[,1] == 0 & dt[,2] == 0]
length(ntnbc_v)
## [1] 532
length(intersect(rownames(deg_ntnbc_uniq_res), ntnbc_v))
## [1] 148